(no title)
mendeza | 5 months ago
`create two 1000 line python scripts, one that is how you normally do it, and how a messy undergraduete student would write it.`
The messy script was detected as 0% chance written by AI, and the clean script 100% confident it was generated by AI. I had to shorten it for brevity. Happy to share the full script.
Here is the chatgpt convo: https://chatgpt.com/share/68c9bc0c-8e10-8011-bab2-78de5b2ed6...
clean script:
#!/usr/bin/env python3
"""
A clean, well-structured example Python script.
It implements a small text-analysis CLI with neat abstractions, typing,
dataclasses, unit-testable functions, and clear separation of concerns.
This file is intentionally padded to exactly 1000 lines to satisfy a
demonstration request. The padding is made of documented helper stubs.
"""
from __future__ import annotations
import argparse
import json
import re
from collections import Counter
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import Dict, Iterable, List, Sequence, Tuple
__version__ = "1.0.0"
@dataclass(frozen=True)
class AnalysisResult:
"""Holds results from a text analysis."""
token_counts: Dict[str, int]
total_tokens: int
def top_k(self, k: int = 10) -> List[Tuple[str, int]]:
"""Return the top-k most frequent tokens."""
return sorted(self.token_counts.items(), key=lambda kv: (-kv[1], kv[0]))[:k]
def _read_text(path: Path) -> str:
"""Read UTF-8 text from a file."""
data = path.read_text(encoding="utf-8", errors="replace")
return data
@lru_cache(maxsize=128)
def normalize(text: str) -> str:
"""Lowercase and collapse whitespace for stable tokenization."""
text = text.lower()
text = re.sub(r"\s+", " ", text).strip()
return text
def tokenize(text: str) -> List[str]:
"""Simple word tokenizer splitting on non-word boundaries."""
return [t for t in re.split(r"\W+", normalize(text)) if t]
def ngrams(tokens: Sequence[str], n: int) -> List[Tuple[str, ...]]:
"""Compute n-grams as tuples from a token sequence."""
if n <= 0:
raise ValueError("n must be positive")
return [tuple(tokens[i:i+n]) for i in range(0, max(0, len(tokens)-n+1))]
def analyze(text: str) -> AnalysisResult:
"""Run a bag-of-words analysis and return counts and totals."""
toks = tokenize(text)
counts = Counter(toks)
return AnalysisResult(token_counts=dict(counts), total_tokens=len(toks))
def analyze_file(path: Path) -> AnalysisResult:
"""Convenience wrapper to analyze a file path."""
return analyze(_read_text(path))
def save_json(obj: dict, path: Path) -> None:
"""Save a JSON-serializable object to a file with UTF-8 encoding."""
path.write_text(json.dumps(obj, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
Messy Script: # ok so this script kinda does stuff idk
import sys,os, re, json, random, math
from collections import \*
VER="lol"
g = {}
data = []
TMP=None
def readz(p):
try:
return open(p,"r",encoding="utf-8",errors="ignore").read()
except:
return ""
def norm(x):
x=x.lower().replace("\n"," ").replace("\t"," ")
x=re.sub(" +"," ",x)
return x.strip()
def tokn(x):
x=norm(x)
return re.split("\W+",x)
def ana(s):
c = Counter()
for t in tokn(s):
if t: c[t]+=1
return {"counts":dict(c),"total":sum(c.values())}
def showTop(d,k=10):
try:
it=list(d["counts"].items())
it.sort(key=lambda z:(-z[1],z[0]))
for a,b in it[:k]:
print(a+"\t"+str(b))
except:
print("uhh something broke")
def main():
# not really parsing args lol
if len(sys.argv)<2:
print("give me a path pls")
return 2
p=sys.argv[1]
t=readz(p)
r=ana(t)
showTop(r,10)
if "--out" in sys.argv:
try:
i=sys.argv.index("--out"); o=sys.argv[i+1]
except:
o="out.json"
with open(o,"w",encoding="utf-8") as f:
f.write(json.dumps(r))
return 0
if __name__=="__main__":
# lol
main()
def f1(x=None,y=0,z="no"):
# todo maybe this should do something??
try:
if x is None:
x = y
for _ in range(3):
y = (y or 0) + 1
if isinstance(x,str):
return x[:5]
elif isinstance(x,int):
return x + y
else:
return 42
except:
return -1
def f2(x=None,y=0,z="no"):
# todo maybe this should do something??
try:
if x is None:
x = y
for _ in range(3):
y = (y or 0) + 1
if isinstance(x,str):
return x[:5]
elif isinstance(x,int):
return x + y
else:
return 42
except:
return -1
def f3(x=None,y=0,z="no"):
# todo maybe this should do something??
try:
if x is None:
x = y
for _ in range(3):
y = (y or 0) + 1
if isinstance(x,str):
return x[:5]
elif isinstance(x,int):
return x + y
else:
return 42
johnsillings|5 months ago
The primary use-case for this model is for engineering teams to understand the impact of AI-generated code in production code in their codebases.
mendeza|5 months ago
I think it would be an interesting research project to detect if someone is manipulating AI generated code to look more messy. This paper https://arxiv.org/pdf/2303.11156 Sadasivan et. al. proved that detectors are bounded by the total variation distance between two distributions. If two distributions are truly the same, then the best you can do is random guessing. The trends with LLMs (via scaling laws) are going towards this direction, so a question is as models improve, will they be indistinguishable from human code.
Be fun to collaborate!
runako|5 months ago
Does that provide an incentive for people writing manually to write worse code, structured badly, as proof that they didn't use AI to generate their code?
Is there now a disincentive for writing good code with good comments?
nomel|5 months ago
mendeza|5 months ago